The paper presenting this data can be accessed at https://pubmed.ncbi.nlm.nih.gov/31078527/ and the samples can be found on GEO at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE117896

The 0h and 12h samples are used here and there are 2 biological replicates for each (SRR7624365,SRR7624366 and SRR7624371,SRR7624372 respectively)

Installing and loading the required packages

Installing CRAN packages

cran.packages = c("tidyr",
                  "ggplot2",
                  "ggrepel",
                  "gridExtra",
                  "NMF",
                  "GGally",
                  "gprofiler2",
                  "UpSetR"
)

new.packages <- cran.packages[!(cran.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

Installing Bioconductor packages

bioconductor.packages = c("preprocessCore",
                           "edgeR",
                           "DESeq2")

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install(update = FALSE,ask=FALSE)
new.bio.packages<-bioconductor.packages[!(bioconductor.packages %in% installed.packages()[,"Package"])]
if(length(new.bio.packages)) BiocManager::install(new.bio.packages)

Loading packages

library(tidyr)
library(ggplot2); theme_set(theme_bw())
library(preprocessCore)
library(ggrepel)
library(gridExtra)
library(edgeR)
library(DESeq2)
library(NMF)
library(GGally)
library(gprofiler2)
library(UpSetR)

Creating the count matrix from featureCounts output

meta = data.frame(id=c('SRR7624365','SRR7624366','SRR7624371','SRR7624372'),
                  rep=c(1,2,1,2),
                  timepoint=c('0h','0h','12h','12h')
    )
rownames(meta)=meta$id
meta$timepoint = as.factor(meta$timepoint)
listofsamples = paste0(meta$id,'_counts.txt')
print(listofsamples)
## [1] "SRR7624365_counts.txt" "SRR7624366_counts.txt" "SRR7624371_counts.txt"
## [4] "SRR7624372_counts.txt"

Look at first sample

first.sample=read.csv(listofsamples[1],sep='\t',skip=1)
str(first.sample)
## 'data.frame':    55401 obs. of  7 variables:
##  $ Geneid                     : Factor w/ 55401 levels "ENSMUSG00000000001",..: 40483 18219 15180 40626 41108 41693 40787 30574 40948 40897 ...
##  $ Chr                        : Factor w/ 3076 levels "1","1;1","1;1;1",..: 1 1 7 1 1 1 1 2 1 1 ...
##  $ Start                      : Factor w/ 55378 levels "1","100006021",..: 24191 24357 24907 25161 25751 25805 26365 26381 26649 26759 ...
##  $ End                        : Factor w/ 55382 levels "100006584","100008347;100006647;100010119;100017090;100017090;100020981;100020981;100022108;100022108;100022108;100023945;1"| __truncated__,..: 24194 24354 24909 25161 25765 25813 26382 26378 26658 26761 ...
##  $ Strand                     : Factor w/ 601 levels "-","-;-","-;-;-",..: 299 299 7 299 1 1 1 300 1 299 ...
##  $ Length                     : int  1070 110 6094 480 2819 2233 2309 250 2057 926 ...
##  $ SRR7624365__Aligned.out.bam: int  1 0 3 0 0 0 0 0 0 0 ...
summary(first.sample$SRR7624365__Aligned.out.bam)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      0.0      0.0      1.0    832.2     42.0 735074.0
head(first.sample)
head(first.sample[,c(1,7)])

Create count matrix

cts = data.frame(gene_id = read.csv(listofsamples[1],sep='\t',skip=1)[,1])
rownames(cts) = cts$gene_id

for (i in meta$id){
  print(i)
  currentfile = read.csv(paste(i,'_counts.txt',sep=''),sep='\t',skip=1) ##read each sample
  cts[,i]=currentfile[,7] ##take expression column and add to count matrix
}
## [1] "SRR7624365"
## [1] "SRR7624366"
## [1] "SRR7624371"
## [1] "SRR7624372"
cts = subset(cts,select=-c(gene_id))

cts = cts[rowSums(cts) > 0,] ##get rid of zero expression genes
head(cts)

Initial QC

Visualising distribution of abundance per sample

qc.plots<-function(cts,title){
cts.tidy = pivot_longer(cts, cols=colnames(cts), names_to='sample', values_to='expression')
# we now remove 0 counts to create more understandable visualizations
cts.tidy$expression[cts.tidy$expression == 0] = NA
cts.tidy$log.expression=log2(cts.tidy$expression)

a<-ggplot(drop_na(cts.tidy), aes(x=log.expression, color=sample)) +
  geom_density(alpha=0.3)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ 
  theme(plot.title = element_text(size=10))
b<-ggplot(drop_na(cts.tidy), aes(x=sample, y=log.expression,color=sample)) +
  geom_violin(alpha=0.3) + 
  theme(legend.position = "none")+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ 
  theme(plot.title = element_text(size=10))+
  theme(axis.text.x=element_blank())
c<-ggplot(drop_na(cts.tidy), aes(x=sample, y=log.expression,color=sample)) +
  geom_boxplot(alpha=0.3) + 
  theme(legend.position = "none")+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ 
  theme(plot.title = element_text(size=10))+
  theme(axis.text.x=element_blank())
  grid.arrange(a,arrangeGrob(b,c),nrow=1,top=title,widths=2:1)
}
qc.plots(cts,'Unfiltered and unnormalised')

ggpairs

ggpairs(cts)

Comparing replicates

grid.arrange(ggplot(cts, aes(x=SRR7624365, y=SRR7624366)) + 
        geom_point() + 
        xlab('0h rep 1 abundance') + 
        ylab('0h rep 2 abundance'),
        ggplot(cts, aes(x=log2(SRR7624365), y=log2(SRR7624366))) + 
        geom_point() + 
        xlab('0h rep 1 log2(abundance)') + 
        ylab('0h rep 2 log2(abundance'),nrow=1,top='0h rep vs rep')

grid.arrange(ggplot(cts, aes(x=SRR7624371, y=SRR7624372)) + 
        geom_point() + 
        xlab('12h rep 1 abundance') + 
        ylab('12h rep 2 abundance'),
        ggplot(cts, aes(x=log2(SRR7624371), y=log2(SRR7624372))) + 
        geom_point() + 
        xlab('12h rep 1 log2(abundance)') + 
        ylab('12h rep 2 log2(abundance'),nrow=1,top='12h rep vs rep')

MA plots

ma.plot = function(counts, meta, i, j, lower.lim=NA, upper.lim=NA, log.transformed=FALSE){
  main.title = paste0(meta$id[i], ' v ', meta$id[j])
  sub.title = paste(meta$timepoint[i], meta$rep[i], 'v',
                    meta$timepoint[j], meta$rep[j])
  
  # if already log transformed we don't need to do it again
  if (log.transformed == TRUE){
    l1 = counts[,i]
    l2 = counts[,j]
  } else {
    # mask away the zeros
    zero.mask = !(counts[,i] == 0 | counts[,j] == 0)
    l1 = log2(counts[zero.mask, i])
    l2 = log2(counts[zero.mask, j])
  }
  
  #calculate M and A
  m = l1 - l2
  a = 0.5 * (l1 + l2)
  data = data.frame(A = a, M = m)
  
  #define MA plot
  p = ggplot(data=data, aes(x=A, y=M, color='red', fill='red')) +
    geom_point(alpha=0.05)+
    theme(legend.position = "none") + 
    geom_hline(yintercept=0.5,colour='gray60')+
    geom_hline(yintercept=-0.5,colour='gray60')+
    geom_hline(yintercept=1,colour='gray60')+
    geom_hline(yintercept=-1,colour='gray60')
  #define boxplot version
  a.binned = cut(a, 20)
  data.binned = data.frame(A = a.binned, M = m)
  q = ggplot(data=data.binned) +
    geom_boxplot(aes(A, M)) +
    theme(axis.text.x=element_text(angle=90))+
    theme(legend.position = "none")  + 
    geom_hline(yintercept=0.5,colour='gray60')+
    geom_hline(yintercept=-0.5,colour='gray60')+
    geom_hline(yintercept=1,colour='gray60')+
    geom_hline(yintercept=-1,colour='gray60')
  
  # add ylim if defined
  if (!is.na(lower.lim) | !is.na(upper.lim)){
    p = p + ylim(lower.lim, upper.lim)
    q = q + ylim(lower.lim, upper.lim)
  }
  grid.arrange(p, q, ncol = 2, top=paste0(main.title, '\n', sub.title))
}
#make MA plots for the pairs of reps
llim = -5
ulim = 5
for (i in 1:2){
  ma.plot(cts, meta, (2*i)-1, 2*i, llim, ulim)
  }

PCA

plot.pca = function(pca.results, meta.table, title){
  data = data.frame(PC1=pca.results$x[,1], PC2=pca.results$x[,2])
  data = cbind(data, meta.table)
  eigs = pca.results$sdev ** 2
  eigs = eigs / sum(eigs)
  xlab = paste0('PC1 (', format(eigs[1]*100, digits=3), '% of variance)')
  ylab = paste0('PC2 (', format(eigs[2]*100, digits=3), '% of variance)')
  ggplot(data=data, aes(x=PC1, y=PC2, color=timepoint)) +
    geom_text_repel(data=data, aes(x=PC1, y=PC2, label=id), size=3) +
    geom_point(alpha=0.6) +
    labs(title=title, x=xlab, y=ylab)
}
#calculate PCA
pca.norm = prcomp(t(cts))
#plot PCA
print(plot.pca(pca.norm, meta,'PCA, unnormalised, unfiltered'))

Incremental PCAs

#PCA with increasing abundance threshold (50,100,300,500,....,1900)
incremental.pca <- function(cts,meta){
  i=50
  expression.threshold = i
  keep.features = apply(cts, 1, max)>expression.threshold ##get genes with at least one sample above threshold
  cts.filtered = cts[keep.features==TRUE,]
  
  cts.filtered[cts.filtered<expression.threshold]=expression.threshold ##increase all below threshold to threshold
  pca.norm = prcomp(t(cts.filtered))
  print(plot.pca(pca.norm, meta, paste('PCA, unnormalised , abundance > ',i,sep='')))
  for (i in seq(100,2000,200)){
    expression.threshold = i
    keep.features = apply(cts, 1, max)>expression.threshold
    print(length(keep.features[keep.features==TRUE]))
    if (length(keep.features[keep.features==TRUE])!=0){
      cts.filtered = cts[keep.features==TRUE,]
      pca.norm = prcomp(t(cts.filtered))
      print(plot.pca(pca.norm, meta, paste('PCA, unnormalised, abundance > ',i,sep='')))
    }
  }
}
incremental.pca(cts,meta)

## [1] 12870

## [1] 10531

## [1] 9407

## [1] 8607

## [1] 7943

## [1] 7324

## [1] 6787

## [1] 6295

## [1] 5835

## [1] 5470

Jaccard similarity index

#calculate JSI for 2 sets
jaccard.index = function(a, b){
  if ((length(a) == 0) & (length(b) == 0)){
    return(1)
  } else{
    u = length(union(a,b))
    i = length(intersect(a,b))
    return(i/u)
  }
}

jaccard.heatmap = function(counts, n.abundant, labels,meta){
  colnames_counts <- paste(meta$timepoint, meta$rep, sep = "_")
  labels=labels[order(colnames_counts)]
  counts=counts[,order(colnames_counts)]
  n.samples = ncol(counts)
  hm = matrix(nrow=n.samples, ncol=n.samples)
  hm[] = 0
  #calculate JSI for each pair
  for (i in 1:n.samples){
    for (j in 1:i){
      i.gene.indices = order(counts[,i], decreasing=TRUE)[1:n.abundant]
      j.gene.indices = order(counts[,j], decreasing=TRUE)[1:n.abundant]
      
      hm[i, j] = jaccard.index(i.gene.indices, j.gene.indices)
      hm[j, i] = hm[i, j]
    }
  }
  title = paste0('Jaccard index of ', n.abundant, ' most abundant genes')
  #plot heatmap
  aheatmap(hm, color='Greys', Rowv = NA, Colv = NA, labRow=labels, labCol=labels, main=title,breaks=seq(0.5,1,0.05),treeheight=0)
}
leaf.labels=paste(meta$timepoint, meta$rep,sep='_')

#create JSI heatmap for top 50,100,200,500,1000 and 2000 genes
n.abundances = c(2000, 1000, 500, 200, 100, 50)
for (n in n.abundances){
  jaccard.heatmap(cts, n, leaf.labels,meta)
}

Noise filtering, normalisation and differential expression

Noise filtering with a fixed threshold

#pre-filtering distribution
print(summary(cts))
##    SRR7624365         SRR7624366         SRR7624371         SRR7624372      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     1.0   1st Qu.:     1.0   1st Qu.:     1.0   1st Qu.:     1.0  
##  Median :     9.0   Median :    11.0   Median :    11.0   Median :    10.0  
##  Mean   :  1252.3   Mean   :   965.8   Mean   :   951.8   Mean   :   924.3  
##  3rd Qu.:   461.8   3rd Qu.:   355.8   3rd Qu.:   333.0   3rd Qu.:   316.0  
##  Max.   :735074.0   Max.   :434806.0   Max.   :393819.0   Max.   :367895.0
expression.threshold <- 20
cts.filtered <- cts
cts.filtered[cts.filtered<expression.threshold]<-expression.threshold
cts.filtered = cts.filtered[rowSums(cts.filtered)>expression.threshold*ncol(cts.filtered),]
#post-filtering distribution
print(summary(cts.filtered))
##    SRR7624365       SRR7624366       SRR7624371       SRR7624372    
##  Min.   :    20   Min.   :    20   Min.   :    20   Min.   :    20  
##  1st Qu.:    48   1st Qu.:    50   1st Qu.:    48   1st Qu.:    44  
##  Median :   507   Median :   388   Median :   364   Median :   344  
##  Mean   :  2555   Mean   :  1969   Mean   :  1941   Mean   :  1885  
##  3rd Qu.:  2354   3rd Qu.:  1768   3rd Qu.:  1737   3rd Qu.:  1650  
##  Max.   :735074   Max.   :434806   Max.   :393819   Max.   :367895

You can instead use noisyr, either the count or transcript version, for the noise filtering (see Ilias’s section).

Quantile normalisation and edgeR DE

Quantile normalisation

#filtering
expression.threshold <- 20
cts.filtered <- cts
cts.filtered[cts.filtered<expression.threshold]<-expression.threshold
cts.filtered = cts.filtered[rowSums(cts.filtered)>expression.threshold*ncol(cts.filtered),]
#quantile normalise
cts.filtered.qnorm=data.frame(normalize.quantiles(as.matrix(cts.filtered)),row.names=rownames(cts.filtered))
colnames(cts.filtered.qnorm)=colnames(cts.filtered)
#post-normalisation summary
summary(cts.filtered.qnorm)
##    SRR7624365         SRR7624366         SRR7624371         SRR7624372      
##  Min.   :    20.0   Min.   :    20.0   Min.   :    20.0   Min.   :    20.0  
##  1st Qu.:    47.5   1st Qu.:    47.5   1st Qu.:    47.5   1st Qu.:    47.5  
##  Median :   400.8   Median :   400.8   Median :   400.5   Median :   400.8  
##  Mean   :  2087.5   Mean   :  2087.6   Mean   :  2087.6   Mean   :  2087.5  
##  3rd Qu.:  1877.4   3rd Qu.:  1877.9   3rd Qu.:  1877.2   3rd Qu.:  1877.5  
##  Max.   :482898.5   Max.   :482898.5   Max.   :482898.5   Max.   :482898.5

QC plots on quantile normalised data

#density plots
qc.plots(cts.filtered.qnorm,title = 'Filtered to s/n 20 and quantile normalised')

#PCA
pca.norm = prcomp(t(cts.filtered.qnorm))
print(plot.pca(pca.norm, meta,'PCA, quantile normalised, filtered to s/n 20'))

#MA
llim = -5
ulim = 5
for (i in 1:2){
  ma.plot(cts.filtered.qnorm, meta, (2*i)-1, 2*i, llim, ulim)
  }

DE on quantile normalised data

contrast=c(-1, 1)
design <- model.matrix(~ 0 + meta$timepoint)
edger <- DGEList(counts = cts.filtered.qnorm)
edger <- estimateDisp(edger, design)
edger.fit <- glmFit(edger, design)
edger.lrt <- glmLRT(edger.fit, contrast=contrast)
edger.table=edger.lrt$table
edger.table$adjusted.PValue=p.adjust(edger.table$PValue,method = 'BH')

#take list of DE genes for later
qnorm.edger.de = rownames(edger.table[abs(edger.table$logFC)>0.5&edger.table$adjusted.PValue<0.05,])
print(length(qnorm.edger.de))
## [1] 4370

Visualising differential expression

#Defining the function for volcano plots with edgeR output
volcano.plot.edger = function(detable, pval.threshold=0.05, lfc.threshold=0.5,
                        log10pval.cap=TRUE # caps x-axis at 10 if true
                        )
{
  # take log fold change, log10 (adjusted p-value) and log2 expression from detable
  df <- data.frame(log2FC = detable$logFC, 
                   log10pval = log10(detable$adjusted.PValue), 
                   log2.expression = detable$logCPM)

  #cap values above 10 -log10pval if param is set
  df = subset(df, !is.na(log10pval))
  if(all(df$log10pval >= -10)){log10pval.cap <- FALSE}
  if(log10pval.cap){
    df$log10pval[df$log10pval < -10] = -10
  }
  
  # take maximum absolute logFC for finite pvals to set as +- x limits
  max.abs.lfc = max(abs(df[df$log10pval > -Inf,]$log2FC))
  
  # set aside the differentially expressed entries using defined thresholds
  logp.threshold = log10(pval.threshold)
  df.de = subset(df, abs(log2FC)>lfc.threshold & log10pval<logp.threshold)
  df.de=df.de[order(df.de$log2.expression),]
  
  # colour genes by whether they above or below logFC and pval threshold
  colours = vector(length=nrow(df))
  colours[] = '#bfbfbf'
  colours[abs(df$log2FC) > lfc.threshold] = 'orange'
  colours[df$log10pval < logp.threshold] = 'red'
  colours[abs(df$log2FC) > lfc.threshold & df$log10pval < logp.threshold] = 'green'
  df$colours <- colours
  
  #define the actual plot
  volcano.plot <- ggplot() +
    geom_point(data=df, aes(x=log2FC, y=-log10pval), alpha=0.1, colour=colours) +
    xlim(-max.abs.lfc, max.abs.lfc) +
    geom_point(data=df.de, aes(x=log2FC, y=-log10pval, colour=log2.expression)) +
    scale_color_gradient(low='#99e6ff', high='#000066') +
    geom_vline(xintercept=lfc.threshold, colour="green") +
    geom_vline(xintercept=-lfc.threshold, colour="green") +
    geom_vline(xintercept=2*lfc.threshold, colour="blue") +
    geom_vline(xintercept=-2*lfc.threshold, colour="blue") +
    geom_hline(yintercept=-logp.threshold, colour="green") +
    geom_hline(yintercept=-2*logp.threshold, colour="blue")
  
  if(log10pval.cap){
    volcano.plot <- volcano.plot +
      scale_y_continuous(labels=c("0.0", "2.5", "5.0", "7.5", ">10"))
  }
  
  return(volcano.plot)
}
volcano.plot.edger(detable = edger.table,pval.threshold = 0.05,lfc.threshold = 0.5)

TMM normalisation with edgeR

expression.threshold <- 20
cts.filtered <- cts
cts.filtered[cts.filtered<expression.threshold]<-expression.threshold
cts.filtered = cts.filtered[rowSums(cts.filtered)>expression.threshold*ncol(cts.filtered),]

contrast=c(-1, 1)
design <- model.matrix(~ 0 + meta$timepoint)
cts.filtered.tmm = DGEList(counts = cts.filtered)
#this is the normalisation line
cts.filtered.tmm = calcNormFactors(cts.filtered.tmm,method = 'TMM')
cts.filtered.tmm <- estimateDisp(cts.filtered.tmm, design)
cts.filtered.tmm.fit <- glmFit(cts.filtered.tmm, design)
cts.filtered.tmm.lrt <- glmLRT(cts.filtered.tmm.fit, contrast=contrast)
cts.filtered.tmm.table <- cts.filtered.tmm.lrt$table
cts.filtered.tmm.table$adjusted.PValue=p.adjust(cts.filtered.tmm.table$PValue,method = 'BH')
TMM.edger.de = rownames(cts.filtered.tmm.table[abs(cts.filtered.tmm.table$logFC)>0.5&cts.filtered.tmm.table$adjusted.PValue<0.05,])
print(length(TMM.edger.de))
## [1] 4076
volcano.plot.edger(detable = cts.filtered.tmm.table,pval.threshold = 0.05,lfc.threshold = 0.5)

DESeq2 normalisation and DE

expression.threshold <- 20
cts.filtered <- cts
cts.filtered[cts.filtered<expression.threshold]<-expression.threshold
cts.filtered = cts.filtered[rowSums(cts.filtered)>expression.threshold*ncol(cts.filtered),]

dds <- DESeqDataSetFromMatrix(countData = cts.filtered,
                              colData = meta,
                              design = ~ timepoint)
## converting counts to integer mode
dds <- estimateSizeFactors(dds)
dds <- DESeq(dds)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds,lfcThreshold = 0.5,pAdjustMethod = 'BH',alpha = 0.05)
res=na.omit(res)

deseq.de=rownames(res[res$padj<0.05&abs(res$log2FoldChange)>0.5,])
print(length(deseq.de))
## [1] 1306

Visualising differentially expressed genes with DESeq2

volcano.plot.deseq = function(res, pval.threshold=0.05, lfc.threshold=0.5,
                        log10pval.cap=TRUE # caps x-axis at 10 if true
                        )
{
  # take log fold change, log10 (adjusted p-value) and log2 expression from detable
  df <- data.frame(log2FC = res$log2FoldChange, 
                   log10pval = log10(res$padj), 
                   log2.expression = log2(res$baseMean))

  #cap values above 10 -log10pval if param is set
  df = subset(df, !is.na(log10pval))
  if(all(df$log10pval >= -10)){log10pval.cap <- FALSE}
  if(log10pval.cap){
    df$log10pval[df$log10pval < -10] = -10
  }
  
  # take maximum absolute logFC for finite pvals to set as +- x limits
  max.abs.lfc = max(abs(df[df$log10pval > -Inf,]$log2FC))
  
  # set aside the differentially expressed entries using defined thresholds
  logp.threshold = log10(pval.threshold)
  df.de = subset(df, abs(log2FC)>lfc.threshold & log10pval<logp.threshold)
  df.de=df.de[order(df.de$log2.expression),]
  
  # colour genes by whether they above or below logFC and pval threshold
  colours = vector(length=nrow(df))
  colours[] = '#bfbfbf'
  colours[abs(df$log2FC) > lfc.threshold] = 'orange'
  colours[df$log10pval < logp.threshold] = 'red'
  colours[abs(df$log2FC) > lfc.threshold & df$log10pval < logp.threshold] = 'green'
  df$colours <- colours
  
  #define the actual plot
  volcano.plot <- ggplot() +
    geom_point(data=df, aes(x=log2FC, y=-log10pval), alpha=0.1, colour=colours) +
    xlim(-max.abs.lfc, max.abs.lfc) +
    geom_point(data=df.de, aes(x=log2FC, y=-log10pval, colour=log2.expression)) +
    scale_color_gradient(low='#99e6ff', high='#000066') +
    geom_vline(xintercept=lfc.threshold, colour="green") +
    geom_vline(xintercept=-lfc.threshold, colour="green") +
    geom_vline(xintercept=2*lfc.threshold, colour="blue") +
    geom_vline(xintercept=-2*lfc.threshold, colour="blue") +
    geom_hline(yintercept=-logp.threshold, colour="green") +
    geom_hline(yintercept=-2*logp.threshold, colour="blue")
  
  if(log10pval.cap){
    volcano.plot <- volcano.plot +
      scale_y_continuous(labels=c("0.0", "2.5", "5.0", "7.5", ">10"))
  }
  
  return(volcano.plot)
}
volcano.plot.deseq(res,pval.threshold = 0.05,lfc.threshold = 0.5)

Comparing sets of differentially expressed genes

Compare the MA plots

qnorm.table.de=edger.table[qnorm.edger.de,]
ggplot()+
  geom_point(data=edger.table,aes(x=logCPM,y=logFC),color='gray')+
  geom_point(data=qnorm.table.de,aes(x=logCPM,y=logFC),color='red')+
  ggtitle('Quantile normalised + edgeR')

tmm.table.de=cts.filtered.tmm.table[TMM.edger.de,]
ggplot()+
  geom_point(data=cts.filtered.tmm.table,aes(x=logCPM,y=logFC),color='gray')+
  geom_point(data=tmm.table.de,aes(x=logCPM,y=logFC),color='red')+
  ggtitle('TMM normalised + edgeR')

deseq.table.de=res[deseq.de,]
ggplot()+
  geom_point(data=as.data.frame(res),aes(x=log2(baseMean),y=log2FoldChange),color='gray')+
  geom_point(data=as.data.frame(deseq.table.de),aes(x=log2(baseMean),y=log2FoldChange),color='red')+
  ggtitle('DESeq2')

Intersection sizes for sets of DE genes

list.of.degenes=list("qnorm_edgeR" = qnorm.edger.de,
                     "TMM_edgeR" = TMM.edger.de,
                     "DESeq2"=deseq.de)

upset(fromList(list.of.degenes))

Enrichment

gprofiler_results = gprofiler2::gost(Reduce(intersect, list.of.degenes),
                                     organism='mmusculus',
                                     custom_bg = rownames(cts.filtered),
                                     sources=c('GO:BP','GO:MF','GO:CC','KEGG','REAC','TF','MIRNA'),
                                     correction_method='fdr')
## Detected custom background input, domain scope is set to 'custom'
gostplot(gprofiler_results, capped = TRUE, interactive = FALSE)

print(head(gprofiler_results$result))
##     query significant      p_value term_size query_size intersection_size
## 1 query_1        TRUE 2.368954e-47      5026       1304               611
## 2 query_1        TRUE 6.193858e-47     13537       1304              1187
## 3 query_1        TRUE 4.757336e-44      4703       1304               574
## 4 query_1        TRUE 9.083887e-44      4341       1304               542
## 5 query_1        TRUE 1.407370e-43      3994       1304               511
## 6 query_1        TRUE 4.988162e-42      3581       1304               470
##   precision    recall    term_id source                          term_name
## 1 0.4685583 0.1215678 GO:0032501  GO:BP   multicellular organismal process
## 2 0.9102761 0.0876856 GO:0008150  GO:BP                 biological_process
## 3 0.4401840 0.1220498 GO:0032502  GO:BP              developmental process
## 4 0.4156442 0.1248560 GO:0048856  GO:BP   anatomical structure development
## 5 0.3918712 0.1279419 GO:0007275  GO:BP multicellular organism development
## 6 0.3604294 0.1312483 GO:0048731  GO:BP                 system development
##   effective_domain_size source_order                parents
## 1                 17946         8831             GO:0008150
## 2                 17946         3543                       
## 3                 17946         8832             GO:0008150
## 4                 17946        15191             GO:0032502
## 5                 17946         3179 GO:0032501, GO:0048856
## 6                 17946        15076 GO:0007275, GO:0048856